import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing
%matplotlib inline
import warnings
warnings.simplefilter('ignore')
# Load the two halves of the dataset and join them column-wise.
# Importing Car Attributes
data1=pd.read_json("Part1 - Car-Attributes.json")
#Importing Car Names
data2=pd.read_csv("Part1 - Car name.csv")
# Column-wise concat; assumes both files share the same row order — TODO confirm.
data_part1_org=pd.concat([data1,data2], axis=1,sort=False) #Merging the separate datasets together.
# Quick structural overview: dtypes and non-null counts per column.
data_part1_org.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 398 non-null float64 1 cyl 398 non-null int64 2 disp 398 non-null float64 3 hp 398 non-null object 4 wt 398 non-null int64 5 acc 398 non-null float64 6 yr 398 non-null int64 7 origin 398 non-null int64 8 car_name 398 non-null object dtypes: float64(3), int64(4), object(2) memory usage: 28.1+ KB
data_part1_org.shape
(398, 9)
data_part1_org.size
3582
def identifying_str_entries(df):
    """Return {column -> list of index labels whose value in that column is a str}.

    Columns with no string entries map to an empty list.

    Improvements over the original: ``isinstance`` instead of ``type(...)==str``
    (idiomatic, also covers str subclasses), and a single pass over each
    column via ``Series.items()`` instead of one ``.loc`` lookup per cell.
    """
    output_entries = {}
    for col in df.columns:
        output_entries[col] = [idx for idx, value in df[col].items()
                               if isinstance(value, str)]
    return output_entries
identifying_str_entries(data_part1_org.drop(["car_name"],axis=1))
{'mpg': [],
'cyl': [],
'disp': [],
'hp': [32, 126, 330, 336, 354, 374],
'wt': [],
'acc': [],
'yr': [],
'origin': []}
Some values in the "hp" column are not numbers. Let us check what they are.
# Show what the non-numeric "hp" entries actually contain.
unknown_entries = identifying_str_entries(data_part1_org.drop(["car_name"], axis=1))
for idx in unknown_entries["hp"]:
    print(data_part1_org["hp"].loc[idx])
? ? ? ? ? ?
Looks like they are question marks. So, let us replace them with NaN entries.
# Replace the "?" placeholder entries with NaN.  Assigning the result back
# avoids calling an inplace replace on a column selection — a chained-
# assignment pattern that newer pandas warns about and may not propagate.
data_part1_org["hp"] = data_part1_org["hp"].replace("?", np.NaN)
# Count the missing values this produced (expected: 6).
data_part1_org["hp"].isna().sum()
6
Total 6 NaN entries in the table. Those can be replaced by something relevant. So, lets replace them with median values.
# "hp" is still an object column (numbers mixed with NaN); make it numeric
# so the median-based imputation below can see it.
data_part1_org["hp"] = pd.to_numeric(data_part1_org["hp"])
# Impute the 6 missing horsepower values with per-column medians.
# numeric_only=True keeps "car_name" (object dtype) out of the median
# computation — DataFrame.median() over object columns raises in pandas >= 2.0.
data_part1 = data_part1_org.fillna(data_part1_org.median(numeric_only=True))
data_part1["hp"] = data_part1["hp"].astype("int64")
data_part1_l = data_part1.drop(["car_name"], axis=1)  # Dropping name for further analysis easing.
data_part1_n = data_part1_l.copy()
# There are no NaN Values anywhere in here. So, moving ahead.
data_part1.isna().sum()
mpg 0 cyl 0 disp 0 hp 0 wt 0 acc 0 yr 0 origin 0 car_name 0 dtype: int64
#Miles per Gallon "mpg" is our target column. So, lets see how it goes.
# Histogram of the raw (pre-scaling) target distribution.
sns.histplot(data_part1_org["mpg"])
<AxesSubplot:xlabel='mpg', ylabel='Count'>
The target column itself has two peaks, implying the data may contain two clusters.
data_part1.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| mpg | 398.0 | 23.514573 | 7.815984 | 9.0 | 17.500 | 23.0 | 29.000 | 46.6 |
| cyl | 398.0 | 5.454774 | 1.701004 | 3.0 | 4.000 | 4.0 | 8.000 | 8.0 |
| disp | 398.0 | 193.425879 | 104.269838 | 68.0 | 104.250 | 148.5 | 262.000 | 455.0 |
| hp | 398.0 | 104.296482 | 38.224809 | 46.0 | 76.000 | 93.0 | 125.000 | 230.0 |
| wt | 398.0 | 2970.424623 | 846.841774 | 1613.0 | 2223.750 | 2803.5 | 3608.000 | 5140.0 |
| acc | 398.0 | 15.568090 | 2.757689 | 8.0 | 13.825 | 15.5 | 17.175 | 24.8 |
| yr | 398.0 | 76.010050 | 3.697627 | 70.0 | 73.000 | 76.0 | 79.000 | 82.0 |
| origin | 398.0 | 1.572864 | 0.802055 | 1.0 | 1.000 | 1.0 | 2.000 | 3.0 |
#Separating Discrete columns from continuous columns
cols_discrete = ["cyl", "yr", "origin"]
cols_cont = ["mpg", "disp", "hp", "acc"]

# Plot for discrete variables: one 1x3 figure (Countplot / Boxplot /
# Stripplot) per column.
# Fixes vs. the original: the extra pre-loop plt.figure() only produced a
# blank figure, and the per-figure plt.title(cols) was immediately
# overwritten by the first subplot's title — plt.suptitle is the
# figure-level equivalent.  Fixed subplot indices replace the manual
# `ki` counter bookkeeping.
for cols in cols_discrete:
    print("\n", cols, "\n")
    plt.figure(figsize=(20, 6))
    plt.suptitle(cols)
    plt.subplot(1, 3, 1)
    plt.title("Countplot")
    sns.countplot(x=data_part1[cols])
    plt.subplot(1, 3, 2)
    plt.title("Boxplot")
    sns.boxplot(x=data_part1[cols])
    plt.subplot(1, 3, 3)
    plt.title("Stripplot")
    sns.stripplot(x=data_part1[cols])
    plt.show()
    plt.close()
cyl
<Figure size 1440x432 with 0 Axes>
yr
origin
Observations:
#plot for continuous variables: Distribution / Box / Strip per column.
# Fixes vs. the original: sns.distplot was deprecated in seaborn 0.11 and
# later removed — histplot(..., kde=True) is the supported equivalent.
# The pre-loop plt.figure() that only produced a blank figure is dropped,
# and fixed subplot indices replace the manual `ki` counter.
for cols in cols_cont:
    print("\n", cols, "\n")
    plt.figure(figsize=(20, 6))
    plt.subplot(1, 3, 1)
    plt.title("Distribution Plot")
    sns.histplot(x=data_part1[cols], kde=True)
    plt.subplot(1, 3, 2)
    plt.title("Box Plot")
    sns.boxplot(x=data_part1[cols])
    plt.subplot(1, 3, 3)
    plt.title("Strip Plot")
    sns.stripplot(x=data_part1[cols])
    plt.show()
    plt.close()
mpg
<Figure size 432x288 with 0 Axes>
disp
hp
acc
Observations:
# Covariance heat-map across the columns.
# NOTE(review): data_part1 still contains the object column "car_name";
# older pandas silently drops it from cov(), newer versions may raise —
# consider cov(numeric_only=True).  TODO confirm the pandas version in use.
plt.figure(figsize=(20,12))
sns.heatmap(data_part1.cov())
<AxesSubplot:>
# Pairwise scatter matrix with KDE diagonals for every numeric column.
sns.pairplot(data_part1,diag_kind="kde")
# Our columns of interest are disp, hp, wt.  The discrete columns (year,
# cylinders, origin) overlap heavily in these plots, suggesting very little
# correlation with them.
<seaborn.axisgrid.PairGrid at 0x1b874da6df0>
Weight seems to be the main contributing factor behind a lot of things here (especially the target column) as it has significant covariance and relationship with other quantities of the dataset. But, further analysis is needed.
#some code to get columns ready.
###########################################
######################################################
#######################################################3
#######################################################
#########################################################
# Produce every index pair (p, i) with p <= i over the given sequence.
def produce_iterable_list_indexes(items_list):
    """Return [(p, i)] for all 0 <= p <= i < len(items_list).

    Bug fix: the original measured the length of the *global*
    ``columns_total`` instead of its own ``items_list`` argument, so it
    only worked when the caller happened to pass that same sequence.
    The dead ``if p == i: pass`` branch is removed as well.
    """
    n = len(items_list)
    return [(p, i) for p in range(n) for i in range(p, n)]
###########################################
######################################################
#######################################################3
#######################################################
#########################################################
#converting the given indexes into actual column labels as an iterable list.
columns_total = data_part1_n.columns
iterable_columns = [
    (columns_total[i1], columns_total[i2])
    for i1, i2 in produce_iterable_list_indexes(columns_total)
    if i1 != i2  # drop the (col, col) self-pairs
]

# Joint plot for every distinct column pair, coloured by mpg.
# Iterating one pair at a time also copes with an odd number of pairs —
# the original stepped by two and indexed [index + 1], which would raise
# IndexError on an odd-length pair list.
for x_col, y_col in iterable_columns:
    sns.jointplot(data=data_part1, x=x_col, y=y_col, hue="mpg")
    plt.show()
    plt.close()
sns.pairplot(data=data_part1_n.groupby(data_part1_n['mpg']).mean())
<seaborn.axisgrid.PairGrid at 0x1b87731f9d0>
Notes: Acceleration has only a weak relationship with the other variables in the table, especially displacement and horsepower.
# Annotated correlation heat-map of the numeric feature frame (car_name dropped).
plt.figure(figsize=(12,6))
sns.heatmap(data_part1_n.corr(),annot=True)
plt.show()
plt.close()
# Annotated covariance heat-map for the same columns.
plt.figure(figsize=(12,6))
sns.heatmap(data_part1_n.cov(),annot=True)
plt.show()
plt.close()
Observing all the above, it is pretty clear that some columns, like acceleration, have little direct bearing on the mileage (mpg). "Year" and "origin" also seem to play an insignificant role in determining the expected mileage of a vehicle, so they can be ignored or simply not cared about.
There are four important columns "wt","disp","cyl","hp", which seem to have a good correlation with "mpg" column.
# "mpg" vs every other column: regression-style point plot + strip plot.
# (iterable_columns[0:7] are exactly the seven mpg-vs-feature pairs.)
# Fix vs. the original: the pre-loop plt.figure() only produced a blank
# figure, and fixed subplot indices replace the manual `ki` counter.
interested_columns = iterable_columns[0:7]
for cols in interested_columns:
    print("\n", cols[0] + " vs " + cols[1], "\n")
    plt.figure(figsize=(20, 6))
    plt.subplot(1, 2, 1)
    plt.title("Regression plot")
    sns.pointplot(x=data_part1_n[cols[1]], y=data_part1[cols[0]])
    plt.subplot(1, 2, 2)
    plt.title("stripplot")
    sns.stripplot(x=data_part1_n[cols[1]], y=data_part1[cols[0]])
    plt.show()
    plt.close()
mpg vs cyl
<Figure size 1440x432 with 0 Axes>
mpg vs disp
mpg vs hp
mpg vs wt
mpg vs acc
mpg vs yr
mpg vs origin
Observations: Mileage holds a strong relationship with the number of cylinders: the cylinder counts show little variance and sit at the peak of the regression plot, suggesting an optimal cylinder count for mileage. There is also a weak trend of mileage increasing as the cars get newer and newer, though it is not yet safe to assert this with confidence. On the other hand, mileage seems strongly correlated with the origin of the car. Is that truly so?
Horsepower though strongly correlated with mileage, it is distributed widely, leaving us to wonder if the relation can be clearly defined. But, compared to other variables, it is probably true to treat those outliers with replacement or just leave them as be, depending on how many there are.
Looking at the above, 4 clusters seem the most appropriate number to generate and follow here. The highest correlations are observed with the disp and wt columns, which show roughly 4 and 5 distinct groups in their respective distributions. Therefore, I opt for 4 as an optimal choice.
Before we proceed we need to process the data, scale it appropriately and then try to fit into the model.
data_part1_n.describe()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| count | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 |
| mean | 23.514573 | 5.454774 | 193.425879 | 104.296482 | 2970.424623 | 15.568090 | 76.010050 | 1.572864 |
| std | 7.815984 | 1.701004 | 104.269838 | 38.224809 | 846.841774 | 2.757689 | 3.697627 | 0.802055 |
| min | 9.000000 | 3.000000 | 68.000000 | 46.000000 | 1613.000000 | 8.000000 | 70.000000 | 1.000000 |
| 25% | 17.500000 | 4.000000 | 104.250000 | 76.000000 | 2223.750000 | 13.825000 | 73.000000 | 1.000000 |
| 50% | 23.000000 | 4.000000 | 148.500000 | 93.000000 | 2803.500000 | 15.500000 | 76.000000 | 1.000000 |
| 75% | 29.000000 | 8.000000 | 262.000000 | 125.000000 | 3608.000000 | 17.175000 | 79.000000 | 2.000000 |
| max | 46.600000 | 8.000000 | 455.000000 | 230.000000 | 5140.000000 | 24.800000 | 82.000000 | 3.000000 |
#Transforming the given dataset with score.
from scipy.stats import zscore
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
# Two scalers: zscore for the continuous columns (applied below) and
# MinMax for an initial pass over the whole frame.
scaler1=zscore
scaler2=MinMaxScaler()
# Min-max scale everything to [0, 1]; fit_transform returns an ndarray, so
# the DataFrame is rebuilt with the saved column labels.
data_part1_n=scaler2.fit_transform(X=data_part1_n)
data_part1_n=pd.DataFrame(data_part1_n,columns=columns_total)
plt.figure(figsize=(20,12))
data_part1_n.boxplot()
plt.show()
plt.close()
# Applying z-score scaling to the continuous columns.
# NOTE(review): these columns end up min-max scaled *and then* z-scored —
# a double scaling; confirm this is intended rather than one or the other.
data_part1_n[cols_cont] =data_part1_n[cols_cont].apply(scaler1)
# Import Models and doing the actual fitting.
from sklearn.cluster import KMeans
from matplotlib import cm

# Elbow method: fit K-Means for k = 1..11 and record the inertia (SSE).
# Only inertia_ feeds the elbow curve; the per-iteration labels_/
# cluster_centers_ the original saved were never used and are dropped.
cluster_km_range = range(1, 12)
cluster_km_errors = []
for num_clusters in cluster_km_range:
    km_clusters = KMeans(num_clusters, n_init=5)
    km_clusters.fit(data_part1_n)
    cluster_km_errors.append(km_clusters.inertia_)

clusters_km_df = pd.DataFrame({"num_clusters": cluster_km_range,
                               "cluster_errors": cluster_km_errors})
print(clusters_km_df)
plt.figure(figsize=(20, 12))
plt.plot(clusters_km_df.num_clusters, clusters_km_df.cluster_errors, marker="x")
plt.show()
plt.close()
num_clusters cluster_errors 0 1 1762.375087 1 2 805.491634 2 3 574.648273 3 4 464.433172 4 5 399.046606 5 6 345.972517 6 7 314.749043 7 8 291.170183 8 9 269.228861 9 10 253.279770 10 11 236.757300
4 clusters seem apt for this particular purpose, looking at the elbow plot above. As discussed earlier, we will proceed with 4.
#Fitting into the model
no_of_clusters=4
# Fixed random_state makes the cluster assignment reproducible.
kmeans = KMeans(n_clusters=no_of_clusters, n_init = 5, random_state=123)
kmeans.fit(data_part1_n)
labels = kmeans.labels_
# Cluster sizes; labels_ are always non-negative, so the mask filters nothing.
kmeans_counts = np.bincount(labels[labels>=0])
print(kmeans_counts)
[100 97 74 127]
# Cluster centroids in the scaled feature space, one row per cluster.
centroids_km = kmeans.cluster_centers_
centroid_df = pd.DataFrame(centroids_km, columns = list(columns_total) )
# (Notebook display cell: the transpose result is not assigned anywhere.)
centroid_df.transpose()
# Pairwise scatter of the centroid coordinates against each other.
sns.pairplot(centroid_df)
plt.show()
Observations: The clusters do not lie far from each other, given how little variation there is within the data, especially with respect to the distributions.
data_part1_n
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | -0.706439 | 1.0 | 1.090604 | 0.673277 | 0.536150 | -1.295498 | 0.0 | 0.0 |
| 1 | -1.090751 | 1.0 | 1.503514 | 1.590065 | 0.589736 | -1.477038 | 0.0 | 0.0 |
| 2 | -0.706439 | 1.0 | 1.196232 | 1.197156 | 0.516870 | -1.658577 | 0.0 | 0.0 |
| 3 | -0.962647 | 1.0 | 1.061796 | 1.197156 | 0.516019 | -1.295498 | 0.0 | 0.0 |
| 4 | -0.834543 | 1.0 | 1.042591 | 0.935216 | 0.520556 | -1.840117 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | 0.446497 | 0.2 | -0.513026 | -0.479257 | 0.333711 | 0.011586 | 1.0 | 0.0 |
| 394 | 2.624265 | 0.2 | -0.925936 | -1.369851 | 0.146583 | 3.279296 | 1.0 | 0.5 |
| 395 | 1.087017 | 0.2 | -0.561039 | -0.531645 | 0.193365 | -1.440730 | 1.0 | 0.0 |
| 396 | 0.574601 | 0.2 | -0.705077 | -0.662615 | 0.286929 | 1.100822 | 1.0 | 0.0 |
| 397 | 0.958913 | 0.2 | -0.714680 | -0.584033 | 0.313864 | 1.391285 | 1.0 | 0.0 |
398 rows × 8 columns
# Add cluster number to original cars data
predict_km = kmeans.predict(data_part1_n)
data_part1_n_km=data_part1_n.copy()
data_part1_n_km["group"] = predict_km
# Categorical dtype so plots and groupbys treat the label as discrete.
data_part1_n_km['group'] = data_part1_n_km['group'].astype('category')
data_part1_n_km.dtypes
mpg float64 cyl float64 disp float64 hp float64 wt float64 acc float64 yr float64 origin float64 group category dtype: object
data_part1["group"] = predict_km #adding K-Means label to original Dataframe
# Per-cluster boxplots of every scaled feature.
# NOTE(review): DataFrame.boxplot(figsize=...) opens its own figure, so the
# preceding plt.figure(...) call leaves an extra blank figure behind.
plt.figure(figsize=(20,12))
data_part1_n_km.boxplot(by = 'group', layout=(3,4), figsize=(15, 10))
plt.show()
plt.close()
<Figure size 1440x864 with 0 Axes>
Observations:
Most of the time, the 3rd group holds the largest number of outliers across almost all of the features. That is very interesting to consider: it might indicate that a different kind of model or relationship applies between the data points in that particular cluster. We might want to use a different model for training there — or perhaps reduce the number of clusters.
from sklearn.cluster import AgglomerativeClustering
# Euclidean is the default distance, so it is no longer passed explicitly:
# the keyword was renamed from affinity= to metric= in scikit-learn 1.2 and
# affinity= was removed in 1.4 — relying on the default keeps this line
# working on both old and new versions.
AC_cluster = AgglomerativeClustering(n_clusters=4, linkage='average')
AC_cluster.fit(data_part1_n)
data_part1_n_ac = data_part1_n.copy()
data_part1_n_ac["group"] = AC_cluster.labels_
# Per-cluster grouping for the mean summary shown below.
temporary = data_part1_n_ac.groupby(["group"])
print(temporary)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001B87F9A16D0>
temporary.mean()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| group | ||||||||
| 0 | 0.336007 | 0.329054 | -0.475508 | -0.472355 | 0.278993 | 0.310635 | 0.559685 | 0.378378 |
| 1 | -1.151105 | 0.995876 | 1.484507 | 1.506352 | 0.717598 | -1.062679 | 0.302405 | 0.000000 |
| 2 | 2.585833 | 0.200000 | -0.976350 | -1.448433 | 0.147647 | 2.652985 | 0.833333 | 0.500000 |
| 3 | 1.855641 | 0.600000 | 0.658488 | -0.505451 | 0.397505 | 0.519896 | 1.000000 | 0.000000 |
data_part1_n_ac
| mpg | cyl | disp | hp | wt | acc | yr | origin | group | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.706439 | 1.0 | 1.090604 | 0.673277 | 0.536150 | -1.295498 | 0.0 | 0.0 | 1 |
| 1 | -1.090751 | 1.0 | 1.503514 | 1.590065 | 0.589736 | -1.477038 | 0.0 | 0.0 | 1 |
| 2 | -0.706439 | 1.0 | 1.196232 | 1.197156 | 0.516870 | -1.658577 | 0.0 | 0.0 | 1 |
| 3 | -0.962647 | 1.0 | 1.061796 | 1.197156 | 0.516019 | -1.295498 | 0.0 | 0.0 | 1 |
| 4 | -0.834543 | 1.0 | 1.042591 | 0.935216 | 0.520556 | -1.840117 | 0.0 | 0.0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | 0.446497 | 0.2 | -0.513026 | -0.479257 | 0.333711 | 0.011586 | 1.0 | 0.0 | 0 |
| 394 | 2.624265 | 0.2 | -0.925936 | -1.369851 | 0.146583 | 3.279296 | 1.0 | 0.5 | 2 |
| 395 | 1.087017 | 0.2 | -0.561039 | -0.531645 | 0.193365 | -1.440730 | 1.0 | 0.0 | 0 |
| 396 | 0.574601 | 0.2 | -0.705077 | -0.662615 | 0.286929 | 1.100822 | 1.0 | 0.0 | 0 |
| 397 | 0.958913 | 0.2 | -0.714680 | -0.584033 | 0.313864 | 1.391285 | 1.0 | 0.0 | 0 |
398 rows × 9 columns
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist
# Hierarchical linkage on the scaled data.
# NOTE(review): method='complete' here, while AgglomerativeClustering above
# used linkage='average' — confirm which the comparison is meant to use.
Z = linkage(data_part1_n, metric='euclidean', method='complete')
# Cophenetic correlation: how faithfully the dendrogram preserves the
# original pairwise distances (closer to 1 is better).
c, coph_dists = cophenet(Z , pdist(data_part1_n))
plt.figure(figsize=(20, 12))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z, leaf_rotation=90.,color_threshold = 40, leaf_font_size=8. )
plt.tight_layout()
c
#cophenetic coefficient.
0.7188757030512185
plt.figure(figsize=(20, 12))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z)
plt.tight_layout()
# NOTE(review): this second, truncated dendrogram draws onto the same axes
# as the full one above — it probably wants its own plt.figure() first.
dendrogram(Z,truncate_mode='lastp',p=no_of_clusters)
plt.show()
#K means clustering for n=4.
print(kmeans_counts)
[100 97 74 127]
Notice that they produce radically different amount of clusters with same number of elements. It is really surprising here. Why did this happen ?
The likely reason they differ is the presence of some irrelevant columns in the data. Those columns weigh much more heavily in the dendrogram than in K-Means clustering. They influence K-Means too, but the inter-cluster distance and the linkage method we chose may mask issues that remain visible in the dendrogram. Perhaps if we change the linkage method to average instead of complete, we will get more nearly identical results?
# Cut the dendrogram at a fixed distance to obtain flat cluster labels.
max_distance=49
from scipy.cluster.hierarchy import fcluster
clusters = fcluster(Z, max_distance, criterion='distance')
# NOTE(review): a threshold of 49 appears to place every sample in a single
# cluster — lower max_distance if multiple flat clusters are wanted.
clusters
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1], dtype=int32)
# Scatter each of the seven mpg-vs-feature pairs, coloured by the flat
# cluster labels obtained from the dendrogram cut.
for pair in iterable_columns[0:7]:
    print(pair)
    plt.scatter(data_part1_n[pair[0]], data_part1_n[pair[1]], c=clusters)
    plt.show()
    plt.close()
('mpg', 'cyl')
('mpg', 'disp')
('mpg', 'hp')
('mpg', 'wt')
('mpg', 'acc')
('mpg', 'yr')
('mpg', 'origin')
It is pretty obvious that there are irrelevant features everywhere in this. The main features which can be eliminated are, acc and wt.
#dealing with Linear Regression Model as it takes input from various clusters.
###################################################################
##################################################################
#################################################################
##################################################################3
###################################################################
def linear_model_fitting(data_in, column_given):
    """Fit ordinary least squares of ``column_given`` on the remaining columns.

    Parameters
    ----------
    data_in : DataFrame containing the target column plus the feature columns.
    column_given : name of the target column.

    Returns
    -------
    (score, coefs) : ``score`` is the R^2 on the training data itself and
    ``coefs`` maps each feature name to its fitted coefficient.
    """
    from sklearn.linear_model import LinearRegression

    x = data_in.drop([column_given], axis=1)
    y = data_in[column_given]
    regression_model = LinearRegression()
    regression_model.fit(x, y)
    model_score = regression_model.score(x, y)
    # zip pairs each feature with its coefficient directly — clearer than
    # the original enumerate-and-index loop with dict.update per item.
    data_out = dict(zip(x.columns, regression_model.coef_))
    return model_score, data_out
#Developing a dataframe such that we can fit regression model into each of the gorup or cluster.
###################################################################
##################################################################
#################################################################
##################################################################3
###################################################################
def details_fitting(given_df, column_interest, no_of_groups):
    """Fit one linear model per cluster and collect the results.

    ``given_df`` must carry a "group" column holding cluster ids 0..n-1.
    Returns a list of ``linear_model_fitting`` results, one per cluster,
    in cluster-id order.
    """
    per_cluster_results = []
    for group_id in range(no_of_groups):
        cluster_rows = given_df[given_df["group"] == group_id]
        features_only = cluster_rows.drop(["group"], axis=1)
        per_cluster_results.append(
            linear_model_fitting(features_only, column_interest))
    return per_cluster_results
###################################################################
##################################################################
#################################################################
##################################################################3
###################################################################
#KMeans Clustering Regression model and its coefficients for each feature within the model for each cluster
pd.DataFrame([dd[1] for dd in details_fitting(given_df=data_part1_n_km,column_interest="mpg",no_of_groups=no_of_clusters)])
| cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|
| 0 | 0.160705 | -0.007612 | 0.021539 | -1.831290 | 0.038335 | 0.613281 | -0.221079 |
| 1 | -0.161794 | -0.056799 | -0.093033 | -0.559321 | -0.168283 | 0.608527 | 0.000000 |
| 2 | 2.758723 | -0.059680 | -0.188852 | -3.998937 | 0.144612 | 1.747514 | 0.226307 |
| 3 | 0.385283 | 0.297946 | -0.229114 | -3.951076 | -0.005121 | 1.075723 | 0.026641 |
Note : Above mentions how models are fit into the clusters and what is the Regression coefficient for that particular cluster in that particular feature.
#Hierarchical Clustering Regression model and its coefficients for each feature within the model for each cluster.
pd.DataFrame([dd[1] for dd in details_fitting(given_df=data_part1_n_ac,column_interest="mpg",no_of_groups=no_of_clusters)])
| cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|
| 0 | 4.109818e-01 | -0.122020 | -0.373255 | -2.834873 | -0.040523 | 1.250812 | 0.194896 |
| 1 | -1.617945e-01 | -0.056799 | -0.093033 | -0.559321 | -0.168283 | 0.608527 | 0.000000 |
| 2 | 2.775558e-16 | -0.059655 | -0.092987 | 0.204494 | -0.178733 | 0.965434 | 0.000000 |
| 3 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
Please Note: Trials to make different models on different clusters is done here.
# We are going to use only labels generated by K-Means Clustering.
###################################################################
##################################################################
#################################################################
##################################################################3
###################################################################
# Fit a batch of classifiers on one cluster's data and score each of them.
def model_fitting(data_in, column_given, list_of_models):
    """Fit every model in ``list_of_models`` on ``data_in`` and score it.

    The target column is label-encoded first so classifiers can treat the
    (continuous) target values as class labels.  Returns a dict mapping
    each fitted model object to its training-set score.
    """
    x = data_in.drop([column_given], axis=1)
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(data_in[column_given])
    scores_by_model = {}
    for model in list_of_models:
        model.fit(x, y)
        scores_by_model[model] = model.score(x, y)
    return scores_by_model
#############################################################################
##############################################################################
##############################################################################
############################################################################
#############################################################################
# Generate the per-cluster, per-model scores for the supplied model list.
def details_fitting(given_df, column_interest, no_of_groups, models_list):
    """Per cluster, fit all supplied models and collect their score dicts.

    Redefines the earlier single-model ``details_fitting`` with an extra
    ``models_list`` parameter; returns one ``model_fitting`` result per
    cluster, in cluster-id order.
    """
    per_cluster_results = []
    for group_id in range(no_of_groups):
        cluster_rows = given_df[given_df["group"] == group_id]
        features_only = cluster_rows.drop(["group"], axis=1)
        per_cluster_results.append(
            model_fitting(features_only, column_interest,
                          list_of_models=models_list))
    return per_cluster_results
#############################################################################
##############################################################################
##############################################################################
############################################################################
#############################################################################
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# Candidate classifiers; only the three placed in list_models below are fitted.
# NOTE(review): max_features=12 exceeds the 7 feature columns and would raise
# if random_forest were ever fitted on this data — confirm before using it.
random_forest = RandomForestClassifier(n_estimators = 50, random_state=1,max_features=12)
gradientboost = GradientBoostingClassifier(n_estimators = 10,random_state=1)
adaboost=AdaBoostClassifier(n_estimators=10,random_state=1)
bagging=BaggingClassifier(n_estimators=50,random_state=1)
decision_tree=DecisionTreeClassifier(criterion = 'entropy')
support_vector = SVC(C=10000,probability=True)
KNN = KNeighborsClassifier(n_neighbors= 5 , weights = 'distance') #Change the Number of neighbours for better modeling
NB_model = GaussianNB()
# Only these three are evaluated per cluster.
list_models=[adaboost,decision_tree,support_vector]
# We are choosing the number of clusters here for our convenience
no_of_clusters=4
# Scores are computed on the training data itself (no held-out split), so a
# DecisionTree score of 1.0 reflects memorisation, not generalisation.
analysis_model=details_fitting(given_df=data_part1_n_km,column_interest="mpg",no_of_groups=no_of_clusters,models_list=list_models)
analysis_model
[{AdaBoostClassifier(n_estimators=10, random_state=1): 0.15,
DecisionTreeClassifier(criterion='entropy'): 1.0,
SVC(C=10000, probability=True): 1.0},
{AdaBoostClassifier(n_estimators=10, random_state=1): 0.23711340206185566,
DecisionTreeClassifier(criterion='entropy'): 1.0,
SVC(C=10000, probability=True): 0.979381443298969},
{AdaBoostClassifier(n_estimators=10, random_state=1): 0.1891891891891892,
DecisionTreeClassifier(criterion='entropy'): 1.0,
SVC(C=10000, probability=True): 1.0},
{AdaBoostClassifier(n_estimators=10, random_state=1): 0.14173228346456693,
DecisionTreeClassifier(criterion='entropy'): 1.0,
SVC(C=10000, probability=True): 1.0}]
# Report each fitted model's training score, cluster by cluster.
for cluster_idx, cluster_scores in enumerate(analysis_model):
    print("\nCluster Number chosen :", cluster_idx)
    for fitted_model, train_score in cluster_scores.items():
        print("\nModel :", fitted_model)
        print("\nModel Score :", train_score)
Cluster Number chosen : 0 Model : AdaBoostClassifier(n_estimators=10, random_state=1) Model Score : 0.15 Model : DecisionTreeClassifier(criterion='entropy') Model Score : 1.0 Model : SVC(C=10000, probability=True) Model Score : 1.0 Cluster Number chosen : 1 Model : AdaBoostClassifier(n_estimators=10, random_state=1) Model Score : 0.23711340206185566 Model : DecisionTreeClassifier(criterion='entropy') Model Score : 1.0 Model : SVC(C=10000, probability=True) Model Score : 0.979381443298969 Cluster Number chosen : 2 Model : AdaBoostClassifier(n_estimators=10, random_state=1) Model Score : 0.1891891891891892 Model : DecisionTreeClassifier(criterion='entropy') Model Score : 1.0 Model : SVC(C=10000, probability=True) Model Score : 1.0 Cluster Number chosen : 3 Model : AdaBoostClassifier(n_estimators=10, random_state=1) Model Score : 0.14173228346456693 Model : DecisionTreeClassifier(criterion='entropy') Model Score : 1.0 Model : SVC(C=10000, probability=True) Model Score : 1.0
• The weight of the vehicle is negatively correlated with mileage. That means vehicle manufacturers should reduce the weight of the vehicle — without compromising stability — for instance by using lightweight alloys instead of heavy metals that do more or less the same kind of work.
• The optimal cylinders seems to be 4 cylinders so that mileage has a minimum and mean better than other vehicles.
• Horse power also has negative correlation with mileage. Horse power indicates the engine power. If we want to see properly, we have proper positive correlation between horsepower and weight clustered around the minimum point.
• The mileage seems to be increasing as time passes, but doesn't look like the increase at a rate that is desirable.
• To get the maximum mileage, the company should concentrate on putting 4 cylinder machines, that are light weight and higher horse power.
• The point of origin of the car seem to be of much less important than what is given here. Mileage of the car depends on how well maintained the car is, the profile of the car which includes the target of the car, the places it is driven in etc.
• So, it would be better if the company separated its lines of production — i.e., target differentiation during the planning stage or while the product is being manufactured — so that different characteristics are enhanced for different purposes. E.g., smaller, mileage-oriented vehicles designed with lightweight metals and a capped top speed will improve mileage; separate those models from the high-acceleration vehicles designed for faster freeways, where the focus is not on mileage.
• Company, if it wants to improve its sales, it should concentrate on generating more data focused on the outside shape i.e. Aerodynamic characteristics, ground clearance and axel configurations and wheels used along with Tyres, because they are the important factors in determining how much time vehicle spends at any particular speed and thus the mileage and its predictability improvement.
• While taking data, the company should concentrate more on how average mileage is changing with age of the vehicle. If a particular vehicle is performing well despite a long age, perhaps there is some special feature in the vehicle that might be enhancing its longevity and consistency in its performance. Hence, those readings would definitely help.
• Along with this, perhaps a generalized survey of the areas where the vehicles were sold (i.e., pincodes) and the condition of the roads in those areas might also help a lot in diagnosing which features should be enhanced for which kinds of area so that they give maximum yield.
• Different models for different clusters is a bit much to ask of such a small data set; however, we can clearly see that some models fit certain clusters with a better score than others. That is proof that there is scope for optimization in the future.
#Exporting the data for further analysis
from pandas import ExcelWriter

# Write to .xlsx: pandas removed the legacy xlwt-based .xls writer, and
# ExcelWriter.save() was deprecated and then removed in favour of closing
# the writer — the context manager closes it automatically.  sheet_name=
# replaces the deprecated positional sheet argument of to_excel.
with ExcelWriter('Part1 - Car-Attributes-further.xlsx') as writer:
    data_part1_n_ac.to_excel(writer, sheet_name='Sheet1')